In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
In [2]:
path = '../Clean Data'
X_fn = 'X.csv'
y_fn = 'y.csv'
X_path = os.path.join(path, X_fn)
y_path = os.path.join(path, y_fn)
X = pd.read_csv(X_path)
y = pd.read_csv(y_path)
In [3]:
X.head()
Out[3]:
Rename the cluster column to just cluster. This won't be needed once we export from the group classification with the correct column name.
In [3]:
X.rename(columns={'cluster_id_6':'cluster'}, inplace=True)
In [4]:
for fuel in ['All coal', 'Lignite', 'Subbituminous']:
    # express each coal price relative to the natural gas price
    X.loc[:, fuel] = X.loc[:, fuel].values / X.loc[:, 'NG Price ($/mcf)'].values
X.drop('NG Price ($/mcf)', axis=1, inplace=True)
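The loop above can also be written as a single vectorized step. A sketch of the equivalent, assuming the same column names, using DataFrame.div (run one or the other, not both):

fuels = ['All coal', 'Lignite', 'Subbituminous']
# divide each fuel-price column by the gas price, row by row
X[fuels] = X[fuels].div(X['NG Price ($/mcf)'], axis=0)
X.drop('NG Price ($/mcf)', axis=1, inplace=True)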
In [5]:
cluster_ids = X['cluster'].unique()
for cluster in cluster_ids:
    # one-hot encode: row i of the identity matrix is zero everywhere except
    # position i, so this is 1 exactly where X['cluster'] == cluster
    X['cluster_{}'.format(cluster)] = np.eye(len(cluster_ids))[X['cluster'], cluster]
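The identity-matrix trick assumes the cluster ids are exactly the consecutive integers 0 through k-1. A sketch of an equivalent that works for arbitrary ids, using pd.get_dummies:

# creates columns named 'cluster_0', 'cluster_1', ... from the id values
dummies = pd.get_dummies(X['cluster'], prefix='cluster')
X = pd.concat([X, dummies], axis=1)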
In [6]:
X.head()
Out[6]:
In [8]:
X.tail()
Out[8]:
In [9]:
y.tail()
Out[9]:
In [7]:
X_cols = ['nameplate_capacity', 'GROSS LOAD (MW)', 'ERCOT Load, MW',
          'Total Wind Installed, MW', 'Total Wind Output, MW', 'Net Load Change (MW)',
          'All coal', 'Lignite', 'Subbituminous']
X_cluster_cols = ['cluster_{}'.format(cluster) for cluster in cluster_ids]
X_clean = X.loc[:,X_cols+X_cluster_cols]
X_clean.fillna(0, inplace=True)
y_clean = y.loc[:,'Gen Change (MW)']
y_clean.fillna(0, inplace=True)
In [11]:
print(X_clean.shape)
print(y_clean.shape)
In [8]:
X_train = X_clean.loc[(X['Year']<2012),:]
y_train = y_clean.loc[(X['Year']<2012)]
X_va = X_clean.loc[X['Year'].isin([2012, 2013]),:]
y_va = y_clean.loc[X['Year'].isin([2012, 2013])]
X_test = X_clean.loc[X['Year']>2013,:]
y_test = y_clean.loc[X['Year']>2013]
Somehow we're missing 2 records from X_va
In [13]:
print(X_va.shape, y_va.shape)
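A quick way to hunt for the missing rows (a sketch, assuming X and y are meant to align row for row on the same index):

print(len(X), len(y))
# index labels present in one validation slice but not the other
print(X_va.index.symmetric_difference(y_va.index))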
Need scaled versions of the X data for some of the models
In [9]:
# fit the scaler on the training data only, then apply the same
# transformation to the validation and test sets to avoid leakage
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_va_scaled = scaler.transform(X_va)
X_test_scaled = scaler.transform(X_test)
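An alternative that avoids managing the scaler by hand is to bundle it with the estimator in a Pipeline (a sketch, using the linear model fit below):

from sklearn.pipeline import make_pipeline
# scaling is fit on whatever data .fit() sees, then reused by .score()
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X_train, y_train)
print(pipe.score(X_va, y_va))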
Check size of all arrays
In [15]:
print(X_train_scaled.shape, y_train.shape)
print(X_va_scaled.shape, y_va.shape)
print(X_test_scaled.shape, y_test.shape)
In [10]:
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
Out[10]:
In [11]:
lm.score(X_va_scaled, y_va)
Out[11]:
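score here is the R^2 of the predictions on the validation set. For an error measure in the same units as the target, RMSE is a useful companion (a sketch):

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_va, lm.predict(X_va_scaled)))
print(rmse)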
In [12]:
y_pr = lm.predict(X_va_scaled)
In [13]:
y_va.values.shape, y_pr.shape, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values.shape
Out[13]:
In [14]:
y_lm_resids = pd.DataFrame(dict(zip(['Gen Change (MW)', 'y_pr', 'cluster'],
                                    [y_va.values, y_pr,
                                     X.loc[X['Year'].isin([2012, 2013]), 'cluster'].values])))
In [21]:
y_lm_resids.head()
Out[21]:
In [15]:
# residuals here are defined as predicted minus actual
y_lm_resids.loc[:, 'residuals'] = y_lm_resids.loc[:, 'y_pr'] - y_lm_resids.loc[:, 'Gen Change (MW)']
In [36]:
with sns.axes_style('whitegrid'):
    g = sns.FacetGrid(y_lm_resids, hue='cluster', col='cluster',
                      col_wrap=3)
    g.map(plt.scatter, 'y_pr', 'residuals', s=5, alpha=0.3)
    g.set_xlabels(size=15)
    g.set_ylabels(size=15)
    plt.savefig('OLS residuals.pdf')
Out[36]: [Figure: OLS residuals vs. predicted values, one panel per cluster]
In [20]:
svm = LinearSVR()
In [25]:
parameters = {'C':np.logspace(-5, 3, num=15)}
In [26]:
lm = GridSearchCV(svm, parameters, n_jobs=-1, verbose=3)
Run the LinearSVR with a grid search over the 15 values of C. GridSearchCV does 3-fold cross-validation by default.
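Note: in newer scikit-learn releases, the mean_train_score used below is only populated when return_train_score=True is passed, so a more explicit construction would look like this (a sketch):

lm = GridSearchCV(svm, parameters, cv=3, n_jobs=-1, verbose=3,
                  return_train_score=True)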
In [27]:
results = lm.fit(X_train_scaled, y_train)
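The fitted GridSearchCV object exposes the winning configuration directly:

# best parameter setting and its mean cross-validation score
print(results.best_params_, results.best_score_)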
In [25]:
param_range = np.logspace(-5, 3, num=13)
train_scores, valid_scores = validation_curve(LinearSVR(), X_train_scaled, y_train,
                                              param_name='C', param_range=param_range,
                                              n_jobs=-1)
In [26]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
plt.title("Validation Curve - LinearSVR", size=15)
plt.xlabel("C", size=15)
plt.ylabel("Score", size=15)
plt.ylim(0.0, .2)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, valid_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.savefig('LinearSVR validation curve.pdf', bbox_inches='tight')
Out[26]: [Figure: LinearSVR validation curve]
In [28]:
train_sizes, train_scores, valid_scores = learning_curve(LinearSVR(), X_train_scaled,
                                                         y_train, n_jobs=-1)
In [33]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
plt.title("Learning Curve - LinearSVR", size=15)
plt.xlabel("Training Size", size=15)
plt.ylabel("Score", size=15)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="darkorange")
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.1, color="navy")
plt.plot(train_sizes, train_scores_mean, 'o-', color="darkorange",
         label="Training score")
plt.plot(train_sizes, valid_scores_mean, 'o-', color="navy",
         label="Cross-validation score")
plt.legend(loc="best")
plt.savefig('LinearSVR learning curve.pdf', bbox_inches='tight')
Out[33]: [Figure: LinearSVR learning curve]
cv_results_ returns a dictionary with all of the results and parameters.
In [28]:
results.cv_results_
Out[28]:
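The dictionary is often easier to scan as a DataFrame (a sketch; the train column assumes return_train_score is enabled, as noted above):

cv_df = pd.DataFrame(results.cv_results_)
cv_df[['param_C', 'mean_train_score', 'mean_test_score']].head()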
In [29]:
test_score = results.cv_results_['mean_test_score']
train_score = results.cv_results_['mean_train_score']
In [30]:
C = [p['C'] for p in results.cv_results_['params']]
In [31]:
C
Out[31]:
Plot the score for each value of C
In [32]:
plt.semilogx(C, test_score, C, train_score)
plt.legend(['test_score', 'train_score'])
Out[32]:
In [33]:
X_train_scaled.shape
Out[33]:
In [38]:
# kernel SVR training scales poorly with sample count, so fit on a random 50% subsample
idx = np.random.choice(np.arange(len(X_train_scaled)), size=int(len(X_train_scaled)*0.5), replace=False)
In [39]:
lm = SVR()
In [40]:
lm.fit(X_train_scaled[idx], y_train.iloc[idx])
Out[40]:
In [41]:
lm.score(X_va_scaled, y_va)
Out[41]:
In [69]:
tSize = [.1, .2, .3, .4, .5]
train_sizes, train_scores, valid_scores = learning_curve(SVR(), X_train_scaled, y_train,
                                                         train_sizes=tSize, n_jobs=-1, verbose=3)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)
plt.grid()
plt.title("Learning Curve - SVR")
plt.xlabel("Fraction of training set")
plt.ylabel("Score")
plt.fill_between(tSize, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="darkorange")
plt.fill_between(tSize, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.1, color="navy")
plt.plot(tSize, train_scores_mean, 'o-', color="darkorange",
         label="Training score")
plt.plot(tSize, valid_scores_mean, 'o-', color="navy",
         label="Cross-validation score")
plt.legend(loc="best")
plt.show()
Out[69]: